NOTE: I HAVE NOT YET GONE THROUGH THIS TO STYLE THE CODE OR CLEAN UP READABILITY.

Complete the following analysis in R and generate an RMarkdown report to show the analysis and results:

Read in the gapminder_clean.csv data as a tibble using read_csv

# Load the gapminder data as a tibble. The first CSV column (row numbers) has
# no header, so readr auto-names it `...1`.
gapminder <- as_tibble(read_csv("gapminder_clean.csv"))
## New names:
## Rows: 2607 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Country Name, continent dbl (18): ...1, Year, Agriculture, value added (%
## of GDP), CO2 emissions (me...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Filter the data to include only rows where Year is 1962 and then make a scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap for the filtered data.

# Keep only the 1962 observations, then plot CO2 emissions against GDP per
# capita. Rows missing either value are dropped by ggplot with a warning.
gapminder1962 <- filter(gapminder, Year == 1962)
gapminder1962 %>%
  ggplot(aes(x = `CO2 emissions (metric tons per capita)`, y = gdpPercap)) +
  geom_point()
## Warning: Removed 151 rows containing missing values (geom_point).

On the filtered data, calculate the correlation of ‘CO2 emissions (metric tons per capita)’ and gdpPercap. What is the correlation and associated p value?

# Pearson correlation test between CO2 emissions and GDP per capita for 1962.
# cor.test() has no `use` argument (that belongs to cor()); it was silently
# swallowed by `...`. cor.test() already restricts itself to complete pairs.
gapminder1962_cor <- cor.test(
  gapminder1962$`CO2 emissions (metric tons per capita)`,
  gapminder1962$gdpPercap
)

# Round for reporting only: 3 significant digits for p, 2 decimals for r.
gapminder1962_p <- gapminder1962_cor$p.value %>%
  signif(3)
gapminder1962_c <- gapminder1962_cor$estimate %>%
  round(2)

# Strings rendered in the report text.
pearson_output <- paste0("Pearson correlation: ", gapminder1962_c, "\n")
pvalue_output <- paste0("P-value: ", gapminder1962_p)

Pearson correlation: 0.93

P-value: 1.13e-46

On the unfiltered data, answer “In what year is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest?” Filter the dataset to that year for the next step…

# Pearson correlation between CO2 emissions and GDP per capita for every year
# present in the data. `years` and `coors` are parallel vectors.
years <- unique(gapminder$Year)

# vapply() returns a pre-sized numeric vector instead of growing one with c()
# on every iteration. The full-precision estimate is stored so that later
# comparisons are not affected by display rounding; only the printed value is
# rounded. cor.test() drops incomplete pairs itself (it has no `use` argument).
coors <- vapply(
  years,
  function(y) {
    gapminderyear <- filter(gapminder, Year == y)
    gap_cor <- cor.test(
      gapminderyear$`CO2 emissions (metric tons per capita)`,
      gapminderyear$gdpPercap
    )$estimate
    cat(paste0(y, " pearson correlation: ", round(gap_cor, 3), "\n"))
    unname(gap_cor)
  },
  numeric(1)
)
## 1962 pearson correlation: 0.926
## 1967 pearson correlation: 0.939
## 1972 pearson correlation: 0.843
## 1977 pearson correlation: 0.793
## 1982 pearson correlation: 0.817
## 1987 pearson correlation: 0.81
## 1992 pearson correlation: 0.809
## 1997 pearson correlation: 0.808
## 2002 pearson correlation: 0.801
## 2007 pearson correlation: 0.72
# Year with the strongest correlation. "Strongest" is judged on magnitude, so
# abs() is used, and which.max() always yields exactly one index; an equality
# test against max() could return several years on a tie, which would break
# the scalar comparison `Year == max_year` used downstream.
max_year <- years[which.max(abs(coors))]
max_year_output <- paste0("\n", "Correlation is strongest in ", max_year)

Correlation is strongest in 1967

Using plotly, create an interactive scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent. You can easily convert any ggplot plot to a plotly plot using the ggplotly() command.

# Interactive scatter plot for the year with the strongest correlation:
# point size encodes population, colour encodes continent.
strongest_year_data <- filter(gapminder, Year == max_year)
plot1 <- ggplot(
  strongest_year_data,
  aes(
    x = `CO2 emissions (metric tons per capita)`,
    y = gdpPercap,
    size = pop,
    color = continent
  )
) +
  geom_point()
ggplotly(plot1)

Now, without further guidance, use your R Data Science skills (and appropriate statistical tests) to answer the following (use the unfiltered dataset):

1. What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’? (stats test needed)

# Keep only rows that have both an energy-use value and a continent.
# The column must be referenced with backticks: a quoted name is just a
# character constant, so is.na() on it is always FALSE and no row is removed.
gapminder2 <- gapminder %>%
  filter(
    !is.na(`Energy use (kg of oil equivalent per capita)`),
    !is.na(continent)
  ) %>%
  rename(energy = `Energy use (kg of oil equivalent per capita)`)

# One-way ANOVA of energy use across continents.
gapminder_aov <- aov(energy ~ continent, data = gapminder2, na.action = na.omit) %>%
  summary()
# gapminder_aov[[1]]
# Pull the p-value of the continent term by column name rather than position.
gapminder_aovP <- gapminder_aov[[1]][["Pr(>F)"]][1]


# Distribution of per-capita energy use within each continent.
# (A violin layer was tried here as an alternative and left disabled.)
ggplot(gapminder2, aes(x = continent, y = energy)) +
  geom_boxplot() +
  labs(y = "Energy use (kg of oil equivalent per capita)")
## Warning: Removed 436 rows containing non-finite values (stat_boxplot).

# Written answer for question 1, with the ANOVA p-value appended.
# Fixes the "ANOV" typo in the report text.
energy_output <- paste("A: There are big differences in energy use across the 5 continents. This can be seen in the boxplots and is highly significant by ANOVA. p-value:", gapminder_aovP)

A: There are big differences in energy use across the 5 continents. This can be seen in the boxplots and is highly significant by ANOVA. p-value: 8.52700348715528e-39

2. Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990? (stats test needed)

# Observations after 1990 for Asia and Europe only.
gapminder1990 <- filter(
  gapminder,
  Year > 1990,
  continent %in% c("Asia", "Europe")
)

# Two-sample t-test (R's default Welch variant) on imports as % of GDP,
# Asia versus Europe.
asia_imports <- gapminder1990 %>%
  filter(continent == "Asia") %>%
  pull(`Imports of goods and services (% of GDP)`)
europe_imports <- gapminder1990 %>%
  filter(continent == "Europe") %>%
  pull(`Imports of goods and services (% of GDP)`)
test_output <- t.test(asia_imports, europe_imports)

# Side-by-side boxplots of the same two groups.
gapminder1990 %>%
  ggplot(aes(x = continent, y = `Imports of goods and services (% of GDP)`)) +
  geom_boxplot()
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

# Written answer for question 2, with the t-test p-value appended.
imports_p_value <- test_output$p.value
output <- paste(
  "A: There is not a significant difference between the two, as seen in the boxplot and quantified by t.test. p-value:",
  imports_p_value
)

A: There is not a significant difference between the two, as seen in the boxplot and quantified by t.test. p-value: 0.177569118980769

3. What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)

### Average for each country ###
  # Mean population density per country over all years, densest first.
  # summarise() replaces the superseded summarise_at()/vars() form. mean() is
  # deliberately called without na.rm, so a country with any missing year gets
  # NA, and arrange(desc()) sorts those NA rows to the bottom.
  gapminder_countries <- gapminder %>%
    group_by(`Country Name`) %>%
    summarise(
      avg_density = mean(`Population density (people per sq. km of land area)`)
    ) %>%
    arrange(desc(avg_density))
# print head and plot
  gapminder_countries %>% head()
## # A tibble: 6 × 2
##   `Country Name`       avg_density
##   <chr>                      <dbl>
## 1 Macao SAR, China          14732.
## 2 Monaco                    14090.
## 3 Hong Kong SAR, China       5153.
## 4 Singapore                  4361.
## 5 Gibraltar                  2622.
## 6 Bermuda                    1133.
    # Interactive bar chart of mean density, one bar per country. The x-axis
    # labels are blanked out; hover in plotly to identify a bar.
    plot1 <- gapminder_countries %>%
      ggplot(aes(x = `Country Name`, y = avg_density)) +
      geom_col() +
      theme(axis.text.x = element_blank())
    ggplotly(plot1)
## Warning: Removed 9 rows containing missing values (position_stack).
### Rank across each country and plot ##

# Rank countries by population density within each year (rank 1 = densest).
# A grouped mutate replaces the empty-template + rbind()-inside-a-loop pattern,
# which copied the accumulated data on every iteration and relied on a `years`
# vector defined in a much earlier chunk. Rows without a Year are dropped, as
# the per-year loop never matched them either.
  gapminder_countries_rank <- gapminder %>%
    filter(!is.na(Year)) %>%
    group_by(Year) %>%
    mutate(rank = rank(-`Population density (people per sq. km of land area)`)) %>%
    ungroup()

# Average each country's yearly density rank; the lowest mean rank comes first.
# summarise() replaces the superseded summarise_at()/vars() form. A country
# with a missing rank in any year gets NA and is sorted to the bottom.
  gapminder_countries_rank <- gapminder_countries_rank %>%
    group_by(`Country Name`) %>%
    summarise(avg_density_rank = mean(rank)) %>%
    arrange(avg_density_rank)
# print head
  gapminder_countries_rank %>% head()
## # A tibble: 6 × 2
##   `Country Name`       avg_density_rank
##   <chr>                           <dbl>
## 1 Macao SAR, China                  1.5
## 2 Monaco                            1.5
## 3 Hong Kong SAR, China              3.1
## 4 Singapore                         3.9
## 5 Gibraltar                         5  
## 6 Bermuda                           6.2
# Written answer for question 3: the top country by mean density, plus the two
# best-placed countries by mean yearly rank.
densest_by_mean <- gapminder_countries[["Country Name"]][1]
top_two_by_rank <- gapminder_countries_rank[["Country Name"]][1:2]
output <- paste0(
  "A: The highest average across all years is ", densest_by_mean,
  ". However, the highest average RANKING (rank 1-263, averaged across the 10 years of data, if they exist for the country) is a tie between these two countries: ",
  top_two_by_rank[1], " and ", top_two_by_rank[2]
)

A: The highest average across all years is Macao SAR, China. However, the highest average RANKING (rank 1-263, averaged across the 10 years of data, if they exist for the country) is a tie between these two countries: Macao SAR, China and Monaco

4. What country (or countries) has shown the greatest increase in ‘Life expectancy at birth, total (years)’ since 1962?

# List countries and set the 1962 baseline (rows lacking a life-expectancy
# value in 1962 are excluded, so those countries end up with an NA difference).
  countries <- gapminder$`Country Name` %>% unique()
  gapminder1962 <- filter(gapminder, Year == 1962, !is.na(`Life expectancy at birth, total (years)`))

# One baseline value per country; distinct() keeps the first 1962 row if a
# country appears more than once.
  baseline_1962 <- gapminder1962 %>%
    distinct(`Country Name`, .keep_all = TRUE) %>%
    select(`Country Name`, life_exp_1962 = `Life expectancy at birth, total (years)`)

# Attach the baseline to every row and compute the change relative to 1962.
# A single join replaces the per-country filter + rbind()-in-a-loop, which
# re-scanned and re-copied the data once per country. Rows without a country
# name are dropped, as the per-country loop never matched them either.
  gapminder_exp <- gapminder %>%
    filter(!is.na(`Country Name`)) %>%
    left_join(baseline_1962, by = "Country Name") %>%
    mutate(Life_expectancy_vs_1962 = `Life expectancy at birth, total (years)` - life_exp_1962) %>%
    select(-life_exp_1962)

# Plot
  # Interactive line chart: change in life expectancy relative to 1962,
  # one line per country.
  plot1 <- ggplot(
    gapminder_exp,
    aes(x = Year, y = Life_expectancy_vs_1962, color = `Country Name`)
  ) +
    geom_line()
  ggplotly(plot1)
# Extract top value
  # Sort so the largest gain over the 1962 baseline sits in the first row
  # (arrange() places NA differences last).
  gapminder_exp_top <- arrange(gapminder_exp, desc(Life_expectancy_vs_1962))

# Written answer for question 4: the country in the top row.
top_gain_country <- gapminder_exp_top[["Country Name"]][1]
output <- paste(top_gain_country, "has shown the greatest increase in life expectancy.")

Maldives has shown the greatest increase in life expectancy.